# Collapse each matrix along one axis at a time, legacy first and new second.
# axis=1 sums across purposes, leaving one total per mode.
legacy_by_mode, new_by_mode = (
    matrix.sum(axis=1) for matrix in (legacy_matrix, new_matrix)
)
# axis=0 sums across modes, leaving one total per purpose.
legacy_by_purpose, new_by_purpose = (
    matrix.sum(axis=0) for matrix in (legacy_matrix, new_matrix)
)
from scipy import stats
# Pearson correlation coefficient (element 0 of the pearsonr result) between
# the legacy and new totals: first for the per-mode aggregates, then for the
# per-purpose aggregates.
r_mode, r_purpose = (
    stats.pearsonr(legacy_totals, new_totals)[0]
    for legacy_totals, new_totals in (
        (legacy_by_mode, new_by_mode),
        (legacy_by_purpose, new_by_purpose),
    )
)
# Legacy-vs-new scatter plots: mode totals in the left panel, purpose totals
# in the right. Each panel title shows the square of its correlation value.


def _add_comparison_panel(fig, col, legacy_totals, new_totals, labels,
                          color, name, axis_max):
    """Draw one comparison panel in row 1, column `col` of `fig`.

    Adds the labelled marker trace (legacy on x, new on y), then a dashed red
    y = x guide line running from the origin to `axis_max`, then the axis
    titles for that panel.
    """
    fig.add_trace(
        go.Scatter(
            x=legacy_totals,
            y=new_totals,
            mode='markers',
            marker={'size': 12, 'opacity': 0.7, 'color': color},
            text=labels,
            hovertemplate='<b>%{text}</b><br>Legacy: %{x:,.0f}<br>New: %{y:,.0f}<extra></extra>',
            name=name,
        ),
        row=1, col=col,
    )
    # Points on this line have identical legacy and new counts.
    fig.add_trace(
        go.Scatter(
            x=[0, axis_max],
            y=[0, axis_max],
            mode='lines',
            line={'color': 'red', 'dash': 'dash', 'width': 1},
            name='Perfect Match',
            hoverinfo='skip',
            showlegend=False,
        ),
        row=1, col=col,
    )
    fig.update_xaxes(title_text='Legacy Tour Count', row=1, col=col)
    fig.update_yaxes(title_text='New Tour Count', row=1, col=col)


fig_scatter = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        f'Mode Distribution Comparison<br><sub>R² = {r_mode**2:.4f}</sub>',
        f'Purpose Distribution Comparison<br><sub>R² = {r_purpose**2:.4f}</sub>',
    ),
    horizontal_spacing=0.12,
)

# 1. Mode panel; the guide line reaches the larger of the two maxima.
max_mode = max(legacy_by_mode.max(), new_by_mode.max())
_add_comparison_panel(
    fig_scatter, 1, legacy_by_mode, new_by_mode,
    mode_labels, 'steelblue', 'Modes', max_mode,
)

# 2. Purpose panel, built the same way.
max_purpose = max(legacy_by_purpose.max(), new_by_purpose.max())
_add_comparison_panel(
    fig_scatter, 2, legacy_by_purpose, new_by_purpose,
    purpose_labels, 'darkgreen', 'Purposes', max_purpose,
)

fig_scatter.update_layout(
    height=500,
    showlegend=False,
    template='plotly_white',
    hovermode='closest',
    margin={'t': 80, 'b': 50, 'l': 50, 'r': 50},
)
fig_scatter.show()
# 2. Sankey Diagrams: Mode and Purpose Flow
# Matched tours show how each legacy classification maps onto the new one.

# Mode Sankey: number of matched tours for every (legacy mode, new mode)
# pair, largest flows first.
mode_flow = (
    matched.group_by(['tmodetp_leg', 'tmodetp_new'])
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
)

mode_labels = list(mode_names.values())
n_modes = len(mode_labels)

# Node layout: legacy modes take indices [0, n_modes) and new modes take
# [n_modes, 2 * n_modes), both in mode_names order.
mode_node_labels = (
    [f"Legacy: {label}" for label in mode_labels]
    + [f"New: {label}" for label in mode_labels]
)

# Position of every mode code, built once so each row costs two dict lookups
# rather than two fresh key lists and two linear scans.
mode_position = {code: position for position, code in enumerate(mode_names)}

# Parallel source/target/value lists describing the Sankey links.
mode_sources = []
mode_targets = []
mode_values = []
for row in mode_flow.iter_rows(named=True):
    legacy_mode = row['tmodetp_leg']
    new_mode = row['tmodetp_new']
    count = row['count']
    try:
        legacy_idx = mode_position[legacy_mode]
        # New-side nodes sit after all legacy-side nodes.
        new_idx = mode_position[new_mode] + n_modes
    except KeyError as err:
        # A code missing from mode_names is still a ValueError, as with a
        # list.index() miss, but the message now names the offending code.
        raise ValueError(
            f"mode code {err.args[0]!r} is not a key of mode_names"
        ) from None
    mode_sources.append(legacy_idx)
    mode_targets.append(new_idx)
    mode_values.append(count)
# Purpose Sankey (with semantic mapping for legacy)
# Legacy purpose codes are passed through legacy_to_semantic; the original
# column is supplied as the default for codes the mapping does not cover.
matched_purpose = matched.with_columns(
    pl.col("pdpurp_leg")
    .replace_strict(legacy_to_semantic, default=pl.col("pdpurp_leg"))
    .alias("pdpurp_leg_semantic")
)

# Number of matched tours per (semantic legacy purpose, new purpose) pair,
# largest flows first.
purpose_flow = (
    matched_purpose.group_by(['pdpurp_leg_semantic', 'pdpurp_new'])
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
)

# Only purpose codes below 10 become nodes, in ascending code order. The key
# list is built once and the labels are derived from it so the two cannot
# drift apart.
purpose_keys = sorted(k for k in purpose_names if k < 10)
purpose_labels_list = [purpose_names[k] for k in purpose_keys]
n_purposes = len(purpose_labels_list)

# Node layout: legacy purposes first, then new purposes.
purpose_node_labels = (
    [f"Legacy: {label}" for label in purpose_labels_list]
    + [f"New: {label}" for label in purpose_labels_list]
)

# Position of every retained purpose code, for constant-time lookup per row.
purpose_position = {code: position for position, code in enumerate(purpose_keys)}

# Parallel source/target/value lists describing the Sankey links.
purpose_sources = []
purpose_targets = []
purpose_values = []
for row in purpose_flow.iter_rows(named=True):
    legacy_purpose = row['pdpurp_leg_semantic']
    new_purpose = row['pdpurp_new']
    count = row['count']
    # A flow touching a code that has no node is left out of the diagram.
    if legacy_purpose not in purpose_position or new_purpose not in purpose_position:
        continue
    legacy_idx = purpose_position[legacy_purpose]
    # New-side nodes sit after all legacy-side nodes.
    new_idx = purpose_position[new_purpose] + n_purposes
    purpose_sources.append(legacy_idx)
    purpose_targets.append(new_idx)
    purpose_values.append(count)
# Two Sankey diagrams side by side: mode flows on the left, purpose flows on
# the right.


def _flow_trace(node_labels, sources, targets, values):
    """Return a Sankey trace for the given node labels and link lists.

    `sources`, `targets` and `values` are passed straight through as the
    link's source, target and value entries.
    """
    node_style = {
        'pad': 15,
        'thickness': 20,
        'line': {'color': "black", 'width': 0.5},
        'label': node_labels,
    }
    links = {'source': sources, 'target': targets, 'value': values}
    return go.Sankey(node=node_style, link=links)


fig_sankey = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Mode Classification Flow', 'Purpose Classification Flow'),
    # Both cells are declared as sankey-type subplots.
    specs=[[{"type": "sankey"}, {"type": "sankey"}]],
    horizontal_spacing=0.05,
)

# Mode Sankey (left cell)
fig_sankey.add_trace(
    _flow_trace(mode_node_labels, mode_sources, mode_targets, mode_values),
    row=1, col=1,
)

# Purpose Sankey (right cell)
fig_sankey.add_trace(
    _flow_trace(purpose_node_labels, purpose_sources, purpose_targets, purpose_values),
    row=1, col=2,
)

fig_sankey.update_layout(
    height=600,
    margin={'t': 80, 'b': 50, 'l': 50, 'r': 50},
)
fig_sankey.show()
# Summary statistics comparing the legacy and new matrices, rendered as a
# Markdown bullet list.
from numpy.linalg import norm
from IPython.display import Markdown, display

# Grand totals and their signed difference (new minus legacy).
total_legacy = int(legacy_matrix.sum())
total_new = int(new_matrix.sum())
total_diff = total_new - total_legacy

# Absolute entries of diff_matrix, summarised three ways.
abs_diff = np.abs(diff_matrix)
mean_abs_diff = float(np.mean(abs_diff))
median_abs_diff = float(np.median(abs_diff))
max_abs_diff = float(np.max(abs_diff))

# Flattened copies are reused by both similarity measures below.
legacy_flat = legacy_matrix.flatten()
new_flat = new_matrix.flatten()

# Total Variation Distance (TVD): each matrix is first rescaled by its own
# total, then half the summed absolute gap between the two is taken.
legacy_dist = legacy_flat / legacy_matrix.sum()
new_dist = new_flat / new_matrix.sum()
share_gap = np.abs(legacy_dist - new_dist)
tvd = 0.5 * np.sum(share_gap)

# Cosine similarity: dot product of the flattened matrices divided by the
# product of their norms.
dot_product = np.dot(legacy_flat, new_flat)
norm_product = norm(legacy_flat) * norm(new_flat)
cosine_sim = dot_product / norm_product

summary_markdown = f"""
**Summary Statistics:**
- **Total Tours:** Legacy = {total_legacy:,} | New = {total_new:,} | Diff = {total_diff:+,}
- **Mean Absolute Difference:** {mean_abs_diff:.1f} tours per mode-purpose combo
- **Median Absolute Difference:** {median_abs_diff:.1f} tours
- **Max Absolute Difference:** {max_abs_diff:.0f} tours
- **Mode Correlation (R):** {r_mode:.4f} (R² = {r_mode**2:.4f})
- **Purpose Correlation (R):** {r_purpose:.4f} (R² = {r_purpose**2:.4f})
- **Total Variation Distance:** {tvd:.4f} (0 = identical, 1 = completely different)
- **Cosine Similarity:** {cosine_sim:.4f} (1 = identical pattern, 0 = orthogonal)
"""
display(Markdown(summary_markdown))